In this notebook we see how well we can reproduce Kd from simulated experimental data.

In this notebook we play with data generated in the 'Just Modeling - Two Component Binding' notebook, and see how our bayesian models do at reproducing the Kd.


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pymc
import seaborn as sns

%pylab inline


Populating the interactive namespace from numpy and matplotlib

The setup.

We use the same setup here as we did in 'Just Modeling - Two Component Binding'.

Experimentally we won't know the Kd, but we know the P, PL, and L concentrations.


In [2]:
Kd = 2e-9 # M

In [3]:
Ptot = 1e-9 # M

In [4]:
Ltot = 20.0e-6 / np.array([10**(float(i)/2.0) for i in range(12)]) # M

In [5]:
def two_component_binding(Kd, Ptot, Ltot):
    """
    Parameters
    ----------
    Kd : float
        Dissociation constant
    Ptot : float
        Total protein concentration
    Ltot : float
        Total ligand concentration
        
    Returns
    -------
    P : float
        Free protein concentration
    L : float
        Free ligand concentration
    PL : float
        Complex concentration
    """
                                    
    PL = 0.5 * ((Ptot + Ltot + Kd) - np.sqrt((Ptot + Ltot + Kd)**2 - 4*Ptot*Ltot))  # complex concentration (uM)
    P = Ptot - PL; # free protein concentration in sample cell after n injections (uM)                                                                                                                                                                                                                          
    L = Ltot - PL; # free ligand concentration in sample cell after n injections (uM)                                                                                                                                                                                                                           
    return [P, L, PL]

[L, P, PL] = two_component_binding(Kd, Ptot, Ltot)

In [6]:
# y will be complex concentration
# x will be total ligand concentration
plt.semilogx(Ltot,PL, 'o')
plt.xlabel('$[L]_{tot}$ / M')
plt.ylabel('$[PL]$ / M')
plt.ylim(0,1.3e-9)
plt.axhline(Ptot,color='0.75',linestyle='--',label='$[P]_{tot}$')
plt.legend();


Now make this a fluorescence experiment.


In [7]:
# Making max 400 relative fluorescence units, and scaling all of PL to that
npoints = len(Ltot)
sigma = 10.0 # size of noise
F_i = (400/1e-9)*PL + sigma * np.random.randn(npoints)
Pstated = np.ones([npoints],np.float64)*Ptot
Lstated = Ltot

In [8]:
# Uncertainties in protein and ligand concentrations.
dPstated = 0.10 * Pstated # protein concentration uncertainty
dLstated = 0.08 * Lstated # ligand concentraiton uncertainty (due to gravimetric preparation and HP D300 dispensing)

The test.


In [9]:
# Define our two-component binding system again.
def two_component_binding(DeltaG, P, L):
    Kd = np.exp(DeltaG)
    PL = 0.5 * ((P + L + Kd) - np.sqrt((P + L + Kd)**2 - 4*P*L));  # complex concentration (M)                                                                                                                                                                                                         
    P = P - PL; # free protein concentration in sample cell after n injections (M)                                                                                                                                                                                                                          
    L = L - PL; # free ligand concentration in sample cell after n injections (M)                                                                                                                                                                                                                           
    return [P, L, PL]

In [10]:
# Create a pymc model
def make_model(Pstated, dPstated, Lstated, dLstated, Fobs_i):
    N = len(Lstated)
    # Prior on binding free energies.
    DeltaG = pymc.Uniform('DeltaG', lower=-40, upper=+40, value=0.0) # binding free energy (kT), uniform over huge range
        
    # Priors on true concentrations of protein and ligand.
    Ptrue = pymc.Normal('Ptrue', mu=Pstated, tau=dPstated**(-2)) # protein concentration (M)
    Ltrue = pymc.Normal('Ltrue', mu=Lstated, tau=dLstated**(-2)) # ligand concentration (M)
    Ltrue_control = pymc.Normal('Ltrue_control', mu=Lstated, tau=dLstated**(-2)) # ligand concentration (M)

    # Priors on fluorescence intensities of complexes (later divided by a factor of Pstated for scale).
    Fmax = Fobs_i.max()
    F_background = pymc.Uniform('F_background', lower=0.0, upper=Fmax) # background 
    F_PL = pymc.Uniform('F_PL', lower=0.0, upper=Fmax/min(Pstated.max(),Lstated.max())) # complex fluorescence
    F_L = pymc.Uniform('F_L', lower=0.0, upper=Fmax/Lstated.max()) # ligand fluorescence

    # Unknown experimental measurement error.
    log_sigma = pymc.Uniform('log_sigma', lower=-10, upper=3, value=0.0) 
    @pymc.deterministic
    def precision(log_sigma=log_sigma): # measurement precision
        return np.exp(-2*log_sigma)

    # Fluorescence model.
    @pymc.deterministic
    def Fmodel(F_background=F_background, F_PL=F_PL, F_L=F_L, Ptrue=Ptrue, Ltrue=Ltrue, DeltaG=DeltaG):
        Fmodel_i = np.zeros([N])
        for i in range(N):
            [P, L, PL] = two_component_binding(DeltaG, Ptrue[i], Ltrue[i])
            Fmodel_i[i] = (F_PL*PL + F_L*L) + F_background
        return Fmodel_i
    
    # Experimental error on fluorescence observations.
    Fobs_model = pymc.Normal('Fobs_i', mu=Fmodel, tau=precision, size=[N], observed=True, value=Fobs_i) # observed data
    
    # Construct dictionary of model variables.
    pymc_model = { 'Ptrue' : Ptrue, 
                   'Ltrue' : Ltrue, 
                   'Ltrue_control' : Ltrue_control, 
                   'log_sigma' : log_sigma, 
                   'precision' : precision, 
                   'F_PL' : F_PL, 
                   'F_L' : F_L, 
                   'F_background' : F_background,
                   'Fmodel_i' : Fmodel,
                   'Fobs_model' : Fobs_model, 
                   'DeltaG' : DeltaG # binding free energy
                   }
    return pymc_model

In [11]:
# Build model.
pymc_model = pymc.Model(make_model(Pstated, dPstated, Lstated, dLstated, F_i))

# Sample with MCMC
mcmc = pymc.MCMC(pymc_model, db='ram', name='Sampler', verbose=True)
mcmc.sample(iter=100000, burn=10000, thin=50, progress_bar=False)

In [12]:
# Plot trace of DeltaG.
rcParams['figure.figsize'] = [15, 3]
plot(mcmc.DeltaG.trace(), 'o');
xlabel('MCMC sample');
ylabel('$\Delta G$ / $k_B T$');



In [13]:
# Plot trace of true protein concentration.
rcParams['figure.figsize'] = [15, 3]
plot(mcmc.Ptrue.trace()*1e6, 'o');
xlabel('MCMC sample');
ylabel('$[P]_{tot}$ / $\mu$M');



In [14]:
# Plot trace of true protein concentration.
rcParams['figure.figsize'] = [15, 3]
plot(mcmc.Ltrue.trace()*1e6, 'o');
xlabel('MCMC sample');
ylabel('$[L]_{tot}$ ($\mu$M)');
print mcmc.Ltrue.trace().min()


4.57991116607e-11

In [15]:
# Plot histogram of DeltaG.
rcParams['figure.figsize'] = [15, 3]
hist(mcmc.DeltaG.trace()[-1000:], 40);
xlabel('$\Delta G$ / $k_B T$');
ylabel('$P(\Delta G)$');



In [16]:
# Plot trace of intrinsic fluorescence parameters.
rcParams['figure.figsize'] = [15, 3]
semilogy(mcmc.F_PL.trace(), 'o', mcmc.F_L.trace(), 'o', mcmc.F_background.trace(), 'o');
legend(['complex fluorescence', 'ligand fluorescence', 'background fluorescence']);
xlabel('MCMC sample');
ylabel('relative fluorescence intensity');



In [17]:
# Plot model fit.
rcParams['figure.figsize'] = [15, 3]
figure = pyplot.gcf() # get current figure
Fmodels = mcmc.Fmodel_i.trace()
clf();
hold(True)
for Fmodel in Fmodels:
    semilogx(Lstated, Fmodel, 'k-')
semilogx(Lstated, F_i, 'ro')
hold(False)
xlabel('$[L]_{tot}$ / M');
ylabel('fluorescence units');


Did it work?


In [18]:
nlast = 500 # number of final samples to analyze
DeltaG = mcmc.DeltaG.trace()[-nlast:].mean()
dDeltaG = mcmc.DeltaG.trace()[-nlast:].std()
print "DeltaG: %.3f +- %.3f kT" % (DeltaG, dDeltaG)


DeltaG: -19.855 +- 0.244 kT

In [19]:
Kd_calc = np.exp(mcmc.DeltaG.trace()[-nlast:]).mean()
dKd_calc = np.exp(mcmc.DeltaG.trace()[-nlast:]).std()
print "Kd = %.3f +- %.3f nM [true: %.3f nM]" % (Kd_calc/1e-9, dKd_calc/1e-9, Kd/1e-9)


Kd = 2.453 +- 0.593 nM [true: 2.000 nM]

In [20]:
relative_error = np.abs(Kd_calc-Kd) / np.abs(Kd)
print "Relative error in Kd is %.5f %%" % (relative_error * 100)


Relative error in Kd is 22.65172 %
Basically we modeled data for a Kd of 2 nM, and with Bayes even with ideal data, it still thought that the Kd was 0.9 nM.

Can we get a better result just by improving our data?

Let's make a 'better' set of data.


In [21]:
def make_two_component_binding(Kd, Ptot, Ltot):
                   
    PL = 0.5 * ((Ptot + Ltot + Kd) - np.sqrt((Ptot + Ltot + Kd)**2 - 4*Ptot*Ltot))  # complex concentration (uM)
    P = Ptot - PL; # free protein concentration in sample cell after n injections (uM)                                                                                                                                                                                                                          
    L = Ltot - PL; # free ligand concentration in sample cell after n injections (uM)                                                                                                                                                                                                                           
    return [P, L, PL]

All we need to do make 'better' data is refine our Ligand range.


In [22]:
Lnew = 1.0e-7 / np.array([10**(float(i)/8.0) for i in range(24)])

In [23]:
[L, P, PL] = make_two_component_binding(2e-9,Ptot,Lnew)
print PL


[  9.80201901e-10   9.73689657e-10   9.65077126e-10   9.53720769e-10
   9.38807921e-10   9.19336626e-10   8.94116463e-10   8.61815458e-10
   8.21091654e-10   7.70854723e-10   7.10678887e-10   6.41298491e-10
   5.64963393e-10   4.85336843e-10   4.06790311e-10   3.33367532e-10
   2.67949192e-10   2.11958098e-10   1.65550780e-10   1.28032647e-10
   9.82696975e-11   7.49925717e-11   5.69806610e-11   4.31532559e-11]

In [24]:
# y will be complex concentration
# x will be total ligand concentration
plt.semilogx(Lnew,PL, 'o')
plt.xlabel('L')
plt.ylabel('PL')
plt.ylim(0,1.3e-9)
plt.axhline(Ptot,color='0.75',linestyle='--',label='[Ptot]')
plt.legend();


Great! Now let's see how pymc does.


In [25]:
# Making max 400 relative fluorescence units, and scaling all of PL to that
npoints = len(Lnew)
F_i = (400/1e-9)*PL + sigma * np.random.randn(npoints)
Pstated = np.ones([npoints],np.float64)*Ptot
Lstated = Lnew
dPstated = 0.10 * Pstated
dLstated = 0.08 * Lstated

In [26]:
# Build model.
pymc_model = pymc.Model(make_model(Pstated, dPstated, Lstated, dLstated, F_i))

# Sample with MCMC
mcmc = pymc.MCMC(pymc_model, db='ram', name='Sampler', verbose=True)
mcmc.sample(iter=100000, burn=10000, thin=50, progress_bar=False)

In [27]:
# Plot trace of true protein concentration.
rcParams['figure.figsize'] = [15, 3]
plot(mcmc.Ptrue.trace()*1e6, 'o');
xlabel('MCMC sample');
ylabel('$[P]_{tot}$ / $\mu$M');



In [28]:
# Plot trace of true protein concentration.
rcParams['figure.figsize'] = [15, 3]
plot(mcmc.Ltrue.trace()*1e6, 'o');
xlabel('MCMC sample');
ylabel('$[L]_{tot}$ ($\mu$M)');
print mcmc.Ltrue.trace().min()


9.51650314534e-11

In [29]:
# Plot trace of DeltaG.
rcParams['figure.figsize'] = [15, 3]
plot(mcmc.DeltaG.trace(), 'o');
xlabel('MCMC sample');
ylabel('$\Delta G$ ($k_B T$)');



In [30]:
# Plot histogram of DeltaG.
rcParams['figure.figsize'] = [15, 3]
hist(mcmc.DeltaG.trace(), 40);
xlabel('$\Delta G$ ($k_B T$)');
ylabel('$P(\Delta G)$');



In [31]:
# Plot trace of intrinsic fluorescence parameters.
rcParams['figure.figsize'] = [15, 3]
semilogy(mcmc.F_PL.trace(), 'o', mcmc.F_L.trace(), 'o', mcmc.F_background.trace(), 'o');
legend(['complex fluorescence', 'ligand fluorescence', 'background fluorescence']);
xlabel('MCMC sample');
ylabel('relative fluorescence intensity');



In [32]:
# Plot model fit.
rcParams['figure.figsize'] = [15, 3]
figure = pyplot.gcf() # get current figure
Fmodels = mcmc.Fmodel_i.trace()
clf();
hold(True)
for Fmodel in Fmodels:
    semilogx(Lstated, Fmodel, 'k-')
semilogx(Lstated, F_i, 'ro')
hold(False)
xlabel('$[L]_s$ (M)');
ylabel('fluorescence units');


Did it work?


In [33]:
nlast = 500 # number of final samples to analyze
DeltaG = mcmc.DeltaG.trace()[-nlast:].mean()
dDeltaG = mcmc.DeltaG.trace()[-nlast:].std()
print "DeltaG: %.3f +- %.3f kT" % (DeltaG, dDeltaG)

Kd_calc = np.exp(mcmc.DeltaG.trace()[-nlast:]).mean()
dKd_calc = np.exp(mcmc.DeltaG.trace()[-nlast:]).std()
print "Kd = %.3f +- %.3f nM [true: %.3f nM]" % (Kd_calc/1e-9, dKd_calc/1e-9, Kd/1e-9)

relative_error = np.abs(Kd_calc-Kd) / np.abs(Kd)
print "Relative error in Kd is %.5f %%" % (relative_error * 100)


DeltaG: -19.964 +- 0.153 kT
Kd = 2.161 +- 0.322 nM [true: 2.000 nM]
Relative error in Kd is 8.05510 %

In [ ]: